author_w_year <- author_paper_affiliations %>%
left_join(author_metadata) %>%
select(paperid, authorid, authorsequencenumber, year_first_paper)
## Joining, by = "authorid"
papers_w_age <- papers %>%
# only take papers with OA status for now
filter(!is.na(is_oa)) %>%
left_join(author_w_year) %>%
mutate(current_age = year - year_first_paper)
## Joining, by = "paperid"
age_dist <- papers_w_age %>%
filter(year == 2018) %>%
distinct(authorid, current_age) %>%
count(current_age) %>%
collect()
age_dist %>%
ggplot(aes(current_age, n)) +
geom_col() +
scale_y_continuous(labels = function(x) format(x,
big.mark = ".",
decimal.mark = ",")) +
labs(y = "Number of researchers", title = "Current age of researchers in 2018",
caption = "Only taking authors on papers that have an OA status into account")
age_trunc <- papers_w_age %>%
filter(current_age <= 50 | current_age > 0) %>%
distinct(year, paperid, is_oa, oa_status, current_age, authorsequencenumber)
# make age groups
age_oa <- age_trunc %>%
mutate(age_group = case_when(
current_age > 0 & current_age < 10 ~ "1-9 years",
current_age >= 10 & current_age < 20 ~ "10-19 years",
current_age >= 20 & current_age < 30 ~ "20-39 years",
current_age >= 30 & current_age < 40 ~ "30-39 years",
current_age >= 40 & current_age < 50 ~ "40-49 years",
current_age >= 50 & current_age < 60 ~ "50-59 years",
TRUE ~ "60+ years"
)) %>%
group_by(year, age_group) %>%
summarise(oa_share = mean(as.numeric(is_oa))) %>%
collect()
age_oa %>%
ggplot(aes(as_year(year), oa_share, colour = age_group, group = age_group)) +
geom_point() +
geom_line() +
scale_y_continuous(labels = function(x) scales::percent(x, accuracy = 1)) +
theme_bw() +
labs(x = NULL, y = "OA share", title = "OA share over time", colour = NULL)
Issue: the current perspective is agnostic to the timeline: citations are aggregated towards each author. To make a claim about whether being prestigious makes you publish OA we would need to know the citations prior to publication. but this data is not available in MAG. It could be generated from the raw graph: counting the links to each paper per year. But probably out of scope for now.
# get the ratio of OA publications by author
author_oa <- author_paper_affiliations %>%
left_join(papers) %>%
# restrict to unpaywall set
filter(!is.na(is_oa)) %>%
group_by(authorid) %>%
summarise(oa_pubs = sum(as.numeric(is_oa))) %>%
left_join(author_metadata, .)
## Joining, by = "paperid"
## Joining, by = "authorid"
author_oa <- author_oa %>%
select(authorid, papercount, year_first_paper, n_citations_norm,
n_unique_co_authors, mean_co_authors, oa_pubs) %>%
mutate(oa_share = oa_pubs / papercount)
author_oa %>%
filter(papercount >= 5) %>%
summarise(oa_cit = cor(oa_share, n_citations_norm),
oa_n_pap = cor(oa_share, papercount),
oa_u_co_auth = cor(oa_share, n_unique_co_authors),
oa_m_co_autho = cor(oa_share, mean_co_authors),
oa_age = cor(oa_share, year_first_paper)) %>%
collect() %>%
knitr::kable()
| oa_cit | oa_n_pap | oa_u_co_auth | oa_m_co_autho | oa_age |
|---|---|---|---|---|
| -0.0491943 | -0.1140835 | -0.0125805 | 0.0135733 | 0.2664612 |
Two smallish effects:
Citations and co-authors have no effect on this global level.
This should be further broken down into disciplines, and if somehow possible across time.
Important note: the analysis of OA on the individual level will likely come from the paper perspective: factors influencing whether a given paper is OA. So different than the QSS paper, which looks at which author characteristics are associated with more OA publications.
Difficulty: time perspective. All variables would need to have a time-wise aspect, i.e. should relate to before the paper was published.
leiden_small_local <- leiden %>%
filter(Field == "All sciences", Frac_counting == 1) %>%
select(University, Country, Period, P_top10, PP_top10, impact_P, P_OA,
PP_OA) %>%
collect() %>%
mutate(across(c(P_top10, PP_top10, impact_P, P_OA, PP_OA), as.numeric),
last_year_of_period = str_extract(Period, "\\d{4}$"))
affil_leiden_key <- read_csv2(
here::here("data/leiden matching/matching_leiden.csv")
)
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## affiliationid = col_double(),
## normalizedname = col_character(),
## displayname = col_character(),
## university_normalized = col_character(),
## University = col_character(),
## Country = col_character(),
## university_normalized_leiden = col_character()
## )
What is the question here? could simply take leiden ranking to investigate connex between prestige and oa publication. Interesting could be cross level effects: are effects of age and gender (if they exist) different for more prestiguous institutions?
maybe do this connex between prestige and OA first, then dig into further stuff BUT: special question here is the contribution towards the SDG stuff, and maybe within that the OA share.
maybe also more interesting to look into the type of OA publication, not simply oa yes or no.
papers_per_affiliation_per_sdg <- papers %>%
left_join(author_paper_affiliations) %>%
left_join(affils) %>%
group_by(authorid, paperid) %>%
mutate(frac_value = 1/n()) %>%
group_by(affiliationid, year, SDG_label) %>%
summarise(n_frac_papers = sum(frac_value, na.rm = TRUE),
n_frac_oa_papers = sum(frac_value * as.numeric(is_oa), na.rm = TRUE),
n_frac_citations = sum(frac_value * citations_norm, na.rm = TRUE)) %>%
collect()
## Joining, by = "paperid"
## Joining, by = c("citationcount", "affiliationid")
papers_per_affiliation_per_w_leiden <- papers_per_affiliation_per_sdg %>%
mutate(affiliationid = as.numeric(affiliationid)) %>% # needed for merging
left_join(affil_leiden_key) %>%
left_join(leiden_small_local)
## Joining, by = "affiliationid"
## Joining, by = c("University", "Country")
affil_oa <- papers_per_affiliation_per_w_leiden %>%
mutate(match_period = year == last_year_of_period) %>%
filter(match_period) %>%
mutate(oa_share = n_frac_oa_papers / n_frac_papers)
affil_oa %>%
group_by(year) %>%
filter(n_frac_papers > 50) %>%
summarise(cor = cor(oa_share, PP_OA, use = "pairwise.complete.obs")) %>%
knitr::kable()
| year | cor |
|---|---|
| 2009 | 0.6684618 |
| 2010 | 0.6678632 |
| 2011 | 0.6931630 |
| 2012 | 0.6374927 |
| 2013 | 0.7103419 |
| 2014 | 0.6956760 |
| 2015 | 0.7043756 |
| 2016 | 0.7502294 |
| 2017 | 0.7753870 |
| 2018 | 0.7962919 |
This gives the overall picture.
affil_oa %>%
filter(year == 2018, n_frac_papers > 50) %>%
ggplot(aes(PP_OA, oa_share)) +
geom_point() +
labs(x = "OA share (leiden)", y = "OA share SDG", caption = "Data for 2018")
Now by SDG
affil_oa %>%
group_by(year, SDG_label) %>%
filter(n_frac_papers > 50) %>%
summarise(cor = cor(oa_share, PP_OA, use = "pairwise.complete.obs")) %>%
ggplot(aes(year, cor, colour = SDG_label)) +
geom_line() +
geom_point() +
labs(y = "Correlation between OA shares in SDG & Leiden", x = NULL,
colour = NULL)
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
The association between overall OA level publications and SDG OA level is rising, in particular for SDG 13.
affil_oa %>%
group_by(year, SDG_label) %>%
filter(n_frac_papers > 50) %>%
summarise(cor = cor(oa_share, P_top10, use = "pairwise.complete.obs")) %>%
ggplot(aes(year, cor, colour = SDG_label)) +
geom_line() +
geom_point() +
scale_y_continuous(limits = c(0, .6)) +
labs(y = "Correlation between OA shares in SDG & P_top10", x = NULL,
colour = NULL)
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
affil_oa %>%
group_by(year, SDG_label) %>%
filter(n_frac_papers > 50) %>%
summarise(cor = cor(oa_share, PP_top10, use = "pairwise.complete.obs")) %>%
ggplot(aes(year, cor, colour = SDG_label)) +
geom_line() +
geom_point() +
scale_y_continuous(limits = c(0, .6)) +
labs(y = "Correlation between OA shares in SDG & PP_top10", x = NULL,
colour = NULL)
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
Very interesting: while in 2008, the prestige of a university (measured both in absolute (P_top10) and relative (PP_top10) terms was associated with the share of OA publications within SDG, this association has been declining. Points to a proliferation of OA? This might be in opposition to our expectations, i.e. when we expected that OA is only feasible for well-resourced actors?
Look at this in terms of types of OA, and with percentiles of the leiden ranks.
pdata <- affil_oa %>%
filter(!is.na(P_top10), n_frac_papers > 50) %>%
group_by(Period) %>%
mutate(across(c(P_top10, PP_top10, P_OA, PP_OA, impact_P), cut_quartiles))
plot_over_time <- function(df, indicator, y_var) {
df %>%
group_by(SDG_label, year, {{indicator}}) %>%
mutate(y_median = median({{y_var}}, na.rm = TRUE)) %>%
ggplot(aes(as_year(year), y_median, colour = {{indicator}})) +
geom_line() +
facet_wrap(vars(SDG_label)) +
scale_y_continuous(labels = scales::percent) +
guides(colour = guide_legend(reverse = TRUE)) +
labs(x = NULL)
}
pdata %>%
plot_over_time(indicator = P_top10, oa_share) +
labs(y = "Share of SDG papers which are OA")
pdata %>%
plot_over_time(indicator = PP_top10, oa_share) +
labs(y = "Share of SDG papers which are OA")
pdata %>%
plot_over_time(indicator = impact_P, oa_share) +
labs(y = "Share of SDG papers which are OA",
title = "# of publications per university vs SDG OA")
step1 <- papers %>%
left_join(author_paper_affiliations) %>%
left_join(affils) %>%
group_by(authorid, paperid) %>%
mutate(frac_value = 1/n())
## Joining, by = "paperid"
## Joining, by = c("citationcount", "affiliationid")
oa_type_per_affiliation_per_sdg <- step1 %>%
group_by(affiliationid, year, SDG_label) %>%
count(provider_cat, wt = frac_value)
totals <- step1 %>%
group_by(affiliationid, year, SDG_label) %>%
summarise(n_frac_papers = sum(frac_value, na.rm = TRUE))
oa_type_per_affiliation_per_sdg <- oa_type_per_affiliation_per_sdg %>%
left_join(totals)
## Joining, by = c("affiliationid", "year", "SDG_label")
oa_type_per_affiliation_per_sdg_local <- oa_type_per_affiliation_per_sdg %>%
filter(n_frac_papers > 50, !is.na(provider_cat)) %>%
collect()
oa_type_affiliation_leiden <- oa_type_per_affiliation_per_sdg_local %>%
mutate(affiliationid = as.numeric(affiliationid)) %>% # needed for merging
left_join(affil_leiden_key) %>%
left_join(leiden_small_local) %>%
# remove those institutions that are not in leiden ranking
filter(!is.na(University), year == as.numeric(last_year_of_period))
## Joining, by = "affiliationid"
## Joining, by = c("University", "Country")
oa_type_affiliation_leiden %>%
filter(provider_cat != "Not OA") %>%
mutate(oa_share = n/n_frac_papers) %>%
group_by(year, SDG_label, provider_cat) %>%
summarise(cor = cor(oa_share, P_top10, use = "pairwise.complete.obs")) %>%
ggplot(aes(year, cor, colour = provider_cat)) +
geom_line() +
geom_point() +
facet_wrap(vars(SDG_label)) +
theme(legend.position = "top") +
labs(title = "Correlation between share of papers in category & P_top10",
colour = NULL)
## `summarise()` has grouped output by 'year', 'SDG_label'. You can override using the `.groups` argument.
What does this mean, especially the green negative correlation?
oa_type_affiliation_leiden %>%
filter(provider_cat != "Not OA") %>%
mutate(oa_share = n/n_frac_papers) %>%
group_by(year, SDG_label, provider_cat) %>%
summarise(cor = cor(oa_share, PP_top10, use = "pairwise.complete.obs")) %>%
ggplot(aes(year, cor, colour = provider_cat)) +
geom_line() +
geom_point() +
facet_wrap(vars(SDG_label)) +
theme(legend.position = "top") +
labs(title = "Correlation between share of papers in category & PP_top10",
colour = NULL)
## `summarise()` has grouped output by 'year', 'SDG_label'. You can override using the `.groups` argument.
p <- oa_type_affiliation_leiden %>%
filter(provider_cat != "Not OA", year == 2018) %>%
group_by(year, SDG_label, affiliationid) %>%
mutate(oa_share = n/sum(n)) %>%
ggplot(aes(P_top10, oa_share, colour = provider_cat)) +
geom_point(size = .7, alpha = .4) +
scale_x_log10() +
geom_smooth() +
facet_wrap(vars(SDG_label)) +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "top") +
labs(y = "Share of papers in category", colour = NULL,
title = "Association between type of OA and institutional prestige (2018)")
p
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Further investigating the split between low prestige universities in SDG 3:
p <- oa_type_affiliation_leiden %>%
filter(provider_cat != "Not OA", year == 2018, SDG_label == "SDG_3") %>%
group_by(year, affiliationid) %>%
mutate(oa_share = n/sum(n)) %>%
ggplot(aes(P_top10, oa_share, colour = provider_cat, size = n_frac_papers)) +
geom_point(alpha = .4) +
scale_x_log10()
p <- p + aes(label = Country, text = University)
plotly::ggplotly(p)
Unclear where this split comes from. It is not related to size (in terms of number of publications), and seems also unrelated to country/continent.
Further things to look at:
gender_base <- papers %>%
select(paperid, SDG_label, year, is_oa) %>%
left_join(author_paper_affiliations_w_groups) %>%
left_join(author_metadata) %>%
select(paperid, SDG_label, year, author_position, authorid,
is_oa, paper_author_cat, gender) %>%
filter(!is.na(is_oa)) # restrict to unpaywall set only
## Joining, by = "paperid"
## Joining, by = "authorid"
gender_rate <- gender_base %>%
group_by(SDG_label, year, gender) %>%
count(is_oa) %>%
mutate(prop = n/sum(n)) %>%
collect()
Visalise oa rate per gender over time
gender_rate %>%
filter(is_oa, gender != "unknown", year < 2019) %>%
ggplot(aes(as_year(year), prop, colour = gender)) +
geom_line() +
geom_point() +
facet_wrap(vars(SDG_label))
This does not show big differences. At least in health, papers by women tend to have slightly higher OA rates.
But: what are we showing here? What is the y axis exactly? Should we maybe show how the number of female authors, or how the gender split in the authors of a given paper is related to OA? would make more sense than overall. Do this only for a specific year.
It will also be more productive to visualise this for first and last authors only.
Split for author positions
gender_rate_p_position <- gender_base %>%
filter(!is.na(is_oa), gender != "unknown") %>% # restrict to unpaywall set only
group_by(SDG_label, year, gender, author_position) %>%
count(is_oa) %>%
mutate(prop = n/sum(n)) %>%
collect()
gender_rate_p_position %>%
filter(is_oa, year < 2019) %>%
ggplot(aes(as_year(year), prop, colour = gender)) +
geom_line() +
geom_point() +
facet_grid(rows = vars(SDG_label),
cols = vars(author_position))
Also here there is no difference. What does this show? The proportion of papers which are OA per gender, SDG, author position and year.
Then, lets look at this in terms of papers
# only do this for papers where we have gender for all authors
gender_ratio_p_paper <- gender_base %>%
select(paperid, gender) %>%
group_by(paperid) %>%
mutate(all_genderized = sum(as.numeric(gender == "unknown")) == 0) %>%
filter(all_genderized) %>%
count(gender) %>%
mutate(prop = n/sum(n)) %>%
ungroup() %>%
filter(gender == "female") %>%
rename(prop_female = prop)
gender_oa_papers <- gender_ratio_p_paper %>%
left_join(papers) %>%
# it could be interesting though to look at this via regression -> then keep
# more variables
select(paperid, prop_female, year, is_oa, SDG_label) %>%
filter(!is.na(is_oa))
## Joining, by = "paperid"
gender_by_sdg <- gender_oa_papers %>%
group_by(year, is_oa, SDG_label) %>%
summarise(mean_prop_female = mean(prop_female)) %>%
collect()
gender_by_sdg %>%
ggplot(aes(as_year(year), mean_prop_female, colour = is_oa)) +
geom_line() +
geom_point() +
facet_wrap(vars(fix_sdg(SDG_label)), nrow = 2) +
scale_y_continuous(labels = function(x) scales::percent(x, 1),
limits = c(.45, .60)) +
labs(x = NULL, y = "Mean share of female authors on publications",
caption = "Including publications with at least two authors") +
theme_bw() +
theme(legend.position = c(.8, .2))
Very interesting: OA papers generally have a lower share of female authors. The reasons for this are probably related with general factors of OA publishing, in complex patterns (institutional prestige, funding, academic age, country differences, etc.). This is why it would be good to do a full model.
What to look at now: author position, i.e. whether authors with female first authors have more OA, or with male last authors, etc.
gender_oa_position <- gender_base %>%
filter(paper_author_cat != "single", author_position != "middle_author",
gender != "unknown") %>%
select(paperid, SDG_label, year, author_position, is_oa, gender) %>%
group_by(year, SDG_label, author_position, gender) %>%
count(is_oa) %>%
mutate(prop = n/sum(n)) %>%
filter(is_oa) %>%
rename(prop_oa = prop) %>%
collect()
pdata <- gender_oa_position %>%
select(-n) %>%
pivot_wider(names_from = gender, values_from = prop_oa) %>%
mutate(diff_from_male = female - male)
pdata %>%
filter(year < 2020) %>%
# filter(author_position == "first_author") %>%
ggplot(aes(as_year(year), diff_from_male, colour = fix_sdg(SDG_label))) +
geom_line() +
geom_point() +
facet_wrap(vars(author_position)) +
scale_y_continuous(labels = scales::percent) +
theme_bw() +
theme(legend.position = "top") +
labs(title = "Difference in OA publishing between genders",
x = NULL, y = "Difference from males", colour = NULL)
A value of x% means that in that year, the share of papers with women as first/last authors which were published OA is X percentage points higher, e.g. male first authors published 50% of their papers OA, females 52% for X = 2.
Why is the difference to the above plot so big? Differences in single authors? This view here only includes papers with at least two authors, the above includes all.
To do: the above (first one) but only with papers which are not single author papers
# only do this for papers where we have gender for all authors
gender_ratio_p_paper_no_single <- gender_base %>%
filter(paper_author_cat != "single") %>%
select(paperid, gender) %>%
group_by(paperid) %>%
mutate(all_genderized = sum(as.numeric(gender == "unknown")) == 0) %>%
filter(all_genderized) %>%
count(gender) %>%
mutate(prop = n/sum(n)) %>%
ungroup() %>%
filter(gender == "female") %>%
rename(prop_female = prop)
gender_oa_papers_no_single <- gender_ratio_p_paper_no_single %>%
left_join(papers) %>%
# it could be interesting though to look at this via regression -> then keep
# more variables
select(paperid, prop_female, year, is_oa, SDG_label) %>%
filter(!is.na(is_oa))
## Joining, by = "paperid"
gender_by_sdg_no_single <- gender_oa_papers_no_single %>%
group_by(year, is_oa, SDG_label) %>%
summarise(mean_prop_female = mean(prop_female)) %>%
collect()
gender_by_sdg_no_single %>%
ggplot(aes(as_year(year), mean_prop_female, colour = is_oa)) +
geom_line() +
geom_point() +
facet_wrap(vars(fix_sdg(SDG_label)), nrow = 2) +
scale_y_continuous(labels = function(x) scales::percent(x, 1)) +
labs(x = NULL, y = "Mean share of female authors on publications",
caption = "Including publications with at least two authors") +
theme_bw() +
theme(legend.position = c(.8, .2))
The effect is already smaller than above, but still there. So single author papers are not the only source of confounding here.
I also checked the above approach (with small differences) for middle authors, but there was not difference there. So where does this gap come from? Why do OA papers have lower rates of females as authors, although rates of females for first and last positions have little effect?
For the gender rate per paper, we only include papers for which we have gender for all authors. Maybe this distorts, i.e. reduces the set considerably?